library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.6
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidytext)
theme_set(theme_light())
#install.packages('schrute')

library(schrute)
office_df <- theoffice
glimpse(office_df)
## Rows: 55,130
## Columns: 12
## $ index            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ season           <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ episode          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ episode_name     <chr> "Pilot", "Pilot", "Pilot", "Pilot", "Pilot", "Pilot",…
## $ director         <chr> "Ken Kwapis", "Ken Kwapis", "Ken Kwapis", "Ken Kwapis…
## $ writer           <chr> "Ricky Gervais;Stephen Merchant;Greg Daniels", "Ricky…
## $ character        <chr> "Michael", "Jim", "Michael", "Jim", "Michael", "Micha…
## $ text             <chr> "All right Jim. Your quarterlies look very good. How …
## $ text_w_direction <chr> "All right Jim. Your quarterlies look very good. How …
## $ imdb_rating      <dbl> 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6, 7.6…
## $ total_votes      <int> 3706, 3706, 3706, 3706, 3706, 3706, 3706, 3706, 3706,…
## $ air_date         <fct> 2005-03-24, 2005-03-24, 2005-03-24, 2005-03-24, 2005-…
office_df %>% 
  slice(1:5) %>%
  knitr::kable()
index season episode episode_name director writer character text text_w_direction imdb_rating total_votes air_date
1 1 1 Pilot Ken Kwapis Ricky Gervais;Stephen Merchant;Greg Daniels Michael All right Jim. Your quarterlies look very good. How are things at the library? All right Jim. Your quarterlies look very good. How are things at the library? 7.6 3706 2005-03-24
2 1 1 Pilot Ken Kwapis Ricky Gervais;Stephen Merchant;Greg Daniels Jim Oh, I told you. I couldn’t close it. So… Oh, I told you. I couldn’t close it. So… 7.6 3706 2005-03-24
3 1 1 Pilot Ken Kwapis Ricky Gervais;Stephen Merchant;Greg Daniels Michael So you’ve come to the master for guidance? Is this what you’re saying, grasshopper? So you’ve come to the master for guidance? Is this what you’re saying, grasshopper? 7.6 3706 2005-03-24
4 1 1 Pilot Ken Kwapis Ricky Gervais;Stephen Merchant;Greg Daniels Jim Actually, you called me in here, but yeah. Actually, you called me in here, but yeah. 7.6 3706 2005-03-24
5 1 1 Pilot Ken Kwapis Ricky Gervais;Stephen Merchant;Greg Daniels Michael All right. Well, let me show you how it’s done. All right. Well, let me show you how it’s done. 7.6 3706 2005-03-24
office_df %>% 
  group_by(season) %>% 
  mutate(avgIMDB = mean(imdb_rating)) %>% 
  ggplot(aes(season, imdb_rating, group = season, color = factor(season))) +
  geom_boxplot() +
  theme(legend.position = 'none') +
  labs(title = "Average IMDB rading by season",
       x = "Season", 
       y = "AVG IMDB rating") +
  scale_x_continuous(breaks = 1:9)

top_characters <- office_df %>% 
  count(character, sort = T) %>% 
  slice(1:20) %>% 
  mutate(character = fct_reorder(character, n))

top_characters %>% 
  ggplot(aes(n, character, fill = n)) +
  geom_col() +
  theme(legend.position = 'none')

office <- office_df %>% 
  select(season, episode, episode_name, imdb_rating, total_votes, air_date) %>%
  mutate(air_date = as.Date(air_date)) %>% 
  distinct(., air_date, .keep_all = TRUE)

p <- office %>% 
  mutate(episode_num = row_number()) %>% 
  ggplot(aes(episode_num, imdb_rating, group = 1, color = factor(season))) +
  geom_line() + 
  geom_smooth(se = FALSE) +
  geom_point(aes(size = total_votes)) +
  #geom_text(aes(label = episode_name), check_overlap = TRUE, hjust = 1) +
  theme(legend.position = 'none') +
  labs(title = 'Complete IMDB ratings for the office',
       subtitle = 'point size ~ # of ratings',
       x = 'Episode',
       y = 'IMDB rating') +
  ggplot2::aes(text = paste0('Episode name: ', episode_name,
                             '\nIMDB rating: ', imdb_rating,
                             '\ntotal votes: ', format(total_votes, big.mark = ','),
                             '\nseason: ', season)
  )

plotly::ggplotly(p, tooltip = 'text')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
junk_words <- c('yeah', 'hey', 'uh', 'um')
names_blacklist <- top_characters %>% 
  select(character) %>% 
  as.character()

words_df <- office_df %>% 
  unnest_tokens('word', 'text') %>%
  inner_join(top_characters %>% 
               select(character), by = 'character') %>% 
  anti_join(stop_words, by = 'word') %>%
  filter(!word %in% junk_words,
         !word %in% names_blacklist) %>% 
  count(character, word) %>% 
  group_by(character) %>% 
  slice_max(n, n = 8) 

words_df %>% 
  slice(1:5) %>%
  knitr::kable()
character word n
Andy gonna 130
Andy guys 118
Andy time 102
Andy dwight 80
Andy erin 78
Angela dwight 81
Angela kevin 40
Angela god 38
Angela pam 31
Angela party 31
Darryl na 48
Darryl mike 46
Darryl gonna 43
Darryl cool 23
Darryl time 23
Dwight jim 294
Dwight michael 282
Dwight gonna 150
Dwight time 140
Dwight wait 117
Erin andy 92
Erin guys 39
Erin michael 37
Erin pam 28
Erin gabe 26
Gabe erin 19
Gabe gabe 17
Gabe gonna 14
Gabe guys 13
Gabe michael 12
Holly michael 35
Holly love 22
Holly gonna 14
Holly time 13
Holly talk 11
Jan michael 159
Jan branch 17
Jan gonna 17
Jan time 16
Jan pam 14
Jim dwight 258
Jim gonna 199
Jim pam 189
Jim michael 181
Jim alright 133
Kelly god 50
Kelly ryan 44
Kelly guys 31
Kelly gonna 26
Kelly michael 26
Kevin michael 71
Kevin jim 39
Kevin pam 39
Kevin oscar 36
Kevin guys 33
Meredith wait 23
Meredith gonna 12
Meredith minute 11
Meredith guys 10
Meredith meredith 10
Michael dwight 353
Michael people 336
Michael pam 327
Michael gonna 287
Michael time 275
Nellie dwight 38
Nellie andy 17
Nellie pam 17
Nellie gonna 13
Nellie jim 13
Oscar michael 65
Oscar angela 37
Oscar kevin 34
Oscar gay 33
Oscar andy 29
Pam michael 262
Pam jim 182
Pam dwight 126
Pam gonna 116
Pam time 93
Phyllis michael 59
Phyllis dwight 34
Phyllis andy 26
Phyllis bob 24
Phyllis time 23
Ryan michael 43
Ryan kelly 41
Ryan people 25
Ryan time 25
Ryan guys 24
Stanley michael 27
Stanley day 15
Stanley gonna 14
Stanley mind 14
Stanley christmas 13
Toby michael 50
Toby gonna 26
Toby talk 26
Toby guys 19
Toby time 15
words_df %>% 
  mutate(word = reorder_within(word, n, character)) %>% 
  ggplot(aes(x = word, y = n, fill = character)) +
  geom_col() + 
  scale_x_reordered() + 
  coord_flip() + 
  facet_wrap(~character, scales = "free") + 
  theme(legend.position = "none")

words_df %>% 
  bind_tf_idf(word, character, n) %>% 
  group_by(character) %>% 
  top_n(tf_idf, n = 5) %>% 
  ungroup() %>% 
  ggplot(aes(x = reorder_within(word, tf_idf, character), y = tf_idf, fill = character)) +
  geom_col() + 
  scale_x_reordered() + 
  coord_flip() + 
  facet_wrap(~character, scales = "free") + 
  theme(legend.position = "none") +
  labs(x = "",
       y = "TF-IDF of character-word pairs")

office_df %>% 
  unnest_tokens('word', 'text') %>%
  inner_join(top_characters %>% 
               select(character), by = 'character') %>% 
  anti_join(stop_words, by = 'word') %>%
  filter(!word %in% junk_words,
         !word %in% names_blacklist) %>% 
  select(season, word) %>% 
  inner_join(get_sentiments(lexicon = "bing")) %>% 
  group_by(season) %>% 
  count(sentiment) %>% 
  ungroup() %>% 
  ggplot(aes(x = season, y = n, fill = sentiment, group = sentiment)) + 
  geom_area() +
  scale_x_continuous(breaks = 1:9) +
  labs(title = 'Area plot of positive and negative sentiments over seasons')
## Joining, by = "word"